#pragma once
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "macro.h"
#include "tensor.h"

// Prototype for launchCalcPaddingoffset; the definition lives outside this
// header. Declared without __global__/__device__ qualifiers, so it is called
// from host code. All three arguments are pointers to TensorWrapper<int> and
// the function returns nothing.
//
// NOTE(review): the roles below are inferred from the parameter names only --
// confirm against the definition before relying on them.
//   padding_offset   presumably receives the computed padding offsets.
//   cur_sum_seqlens  presumably receives running sums of sequence lengths.
//   input_lengths    presumably supplies the per-sequence input lengths.
void launchCalcPaddingoffset(
    TensorWrapper<int>* padding_offset,
    TensorWrapper<int>* cur_sum_seqlens,
    TensorWrapper<int>* input_lengths);